suppressPackageStartupMessages(library(plyr))
suppressWarnings(suppressMessages(library(dplyr)))
suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
suppressWarnings(library(tidyverse))
knitr::opts_chunk$set(fig.width=13, fig.height=8)
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(kableExtra)))
suppressWarnings(options(knitr.table.format = "markdown"))
library(readr)
suppressWarnings(suppressMessages(library(forcats)))
#install.packages("gdata")
suppressWarnings(suppressMessages(library(gdata)))
# install.packages("RColorBrewer")
library(RColorBrewer)
Vincenzo Coia has approved my request to use published genomic data for the next assignments, therefore, I want to provide a few clarifications:
Thibodeau, M. L. et al. Genomic profiling of pelvic genital type leiomyosarcoma in a woman with a germline CHEK2:c.1100delC mutation and a concomitant diagnosis of metastatic invasive ductal breast carcinoma. Cold Spring Harb Mol Case Stud mcs.a001628 (2017). doi:10.1101/mcs.a001628
Open access article and data available here
chek2_rna_cnv <- read.table("/Users/mylinh/Desktop/chek2-data-trial-stat545/chek2-rna-expression-cnv-data.txt", sep="\t", strip.white = TRUE, header = TRUE)
#View(chek2_rna_cnv)
# class(chek2_rna_cnv)
# typeof(chek2_rna_cnv)
summary(chek2_rna_cnv) %>% kable(format = "markdown", align="c")
| chr | start | end | strand | cytoband | Ensembl.gene.ID | hugo | copy.category | copy.change | avg.cna | avg.cna.by.gene | breakpoint | manually.curated.homd | loh | loh_ratio | intepreted.expression.status | RPKM | SARC.percentile | SARC.kIQR | FC.mean.Bodymap | avg.TCGA.percentile | avg.TCGA.kIQR | avg.TCGA.norm.percentile | avg.TCGA.norm.kIQR | UCEC.norm.percentile | UCEC.norm.kIQR | cancer.gene.type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 : 2042 | Min. : 5810 | Min. : 31427 | Min. :-1.00000 | p13.3 : 620 | ENSG00000000003: 1 | : 356 | : 34 | Min. :-2.00000 | Min. :-0.45000 | Min. :-0.47000 | Min. :0.00000 | Min. :0.00000 | : 92 | Min. :0.1800 | :18932 | Min. : 0.00 | Min. : 0.00 | Min. :-2.30 | Min. :-893.100 | Min. : 0.00 | Min. :-2.00 | Min. : 0.00 | Min. :-3.48 | Min. : 0.00 | Min. :-8.31 | :18335 | |
| 19 : 1421 | 1st Qu.: 31494320 | 1st Qu.: 31525315 | 1st Qu.:-1.00000 | q22.1 : 489 | ENSG00000000005: 1 | AGAP9 : 2 | Gain : 1092 | 1st Qu.: 0.00000 | 1st Qu.: 0.01000 | 1st Qu.: 0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | DLOH: 1 | 1st Qu.:0.4700 | down: 460 | 1st Qu.: 0.23 | 1st Qu.: 27.00 | 1st Qu.:-0.37 | 1st Qu.: -1.480 | 1st Qu.: 23.00 | 1st Qu.:-0.40 | 1st Qu.: 22.00 | 1st Qu.:-0.42 | 1st Qu.: 8.00 | 1st Qu.:-0.83 | putative tumour suppressor : 667 | |
| 11 : 1281 | Median : 57837876 | Median : 57882616 | Median : 1.00000 | q13.2 : 433 | ENSG00000000419: 1 | CT45A5 : 2 | Homozygous Loss: 79 | Median : 0.00000 | Median : 0.01000 | Median : 0.02000 | Median :0.00000 | Median :0.00000 | HET :19503 | Median :0.5000 | up : 224 | Median : 3.17 | Median : 51.00 | Median : 0.08 | Median : -1.060 | Median : 47.00 | Median :-0.01 | Median : 48.00 | Median : 0.00 | Median : 42.00 | Median :-0.18 | oncogene : 303 | |
| 2 : 1224 | Mean : 74255388 | Mean : 74321328 | Mean : 0.01393 | p13.2 : 425 | ENSG00000000457: 1 | DCDC1 : 2 | Loss : 2147 | Mean :-0.05834 | Mean : 0.00147 | Mean : 0.00926 | Mean :0.00123 | Mean :0.00405 | HOMD: 14 | Mean :0.5027 | NA | Mean : 22.74 | Mean : 50.92 | Mean : Inf | Mean : -1.208 | Mean : 47.83 | Mean : Inf | Mean : 48.95 | Mean : Inf | Mean : 43.87 | Mean : Inf | tumour suppressor : 128 | |
| 17 : 1159 | 3rd Qu.:111340848 | 3rd Qu.:111383246 | 3rd Qu.: 1.00000 | q11.2 : 366 | ENSG00000000460: 1 | DIO3 : 2 | Neutral :16206 | 3rd Qu.: 0.00000 | 3rd Qu.: 0.02000 | 3rd Qu.: 0.04000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | NLOH: 6 | 3rd Qu.:0.5400 | NA | 3rd Qu.: 9.42 | 3rd Qu.: 76.00 | 3rd Qu.: 0.77 | 3rd Qu.: 1.200 | 3rd Qu.: 73.00 | 3rd Qu.: 0.66 | 3rd Qu.: 76.00 | 3rd Qu.: 0.73 | 3rd Qu.: 79.00 | 3rd Qu.: 0.84 | putative oncogene : 114 | |
| 3 : 1057 | Max. :249200395 | Max. :249214145 | Max. : 1.00000 | q21.3 : 348 | ENSG00000000938: 1 | DTX2 : 2 | No Data : 58 | Max. : 2.00000 | Max. : 0.42000 | Max. : 0.42000 | Max. :1.00000 | Max. :1.00000 | NA | Max. :0.9000 | NA | Max. :34198.14 | Max. :100.00 | Max. : Inf | Max. : 121.580 | Max. :100.00 | Max. : Inf | Max. :100.00 | Max. : Inf | Max. :100.00 | Max. : Inf | oncogene; putative tumour suppressor: 46 | |
| (Other):11432 | NA’s :92 | NA’s :92 | NA’s :92 | (Other):16935 | (Other) :19610 | (Other):19250 | NA | NA’s :92 | NA’s :92 | NA’s :92 | NA’s :92 | NA’s :92 | NA | NA’s :92 | NA | NA | NA’s :1347 | NA’s :2454 | NA | NA’s :760 | NA’s :1867 | NA’s :1347 | NA’s :2331 | NA’s :1347 | NA’s :2438 | (Other) : 23 |
dim(chek2_rna_cnv)
## [1] 19616 27
Note 1. I tried to use the functions from readr (read_table(), read_table2 and read_tsv()), but the column names with spaces (e.g. Ensembl gene ID) were not converted to column names with dot separation like it is done in read.table (E.g. Ensembl gene ID -> Ensembl.gene.ID.), which messed up the full data frame. I had error messages when trying to use dget() and dput. Therefore, why change a functional approach: I decided to keep read.table to do this homework.
chek2_rna_cnv <- with(chek2_rna_cnv, chek2_rna_cnv[!(chr==""),])
chek2_rna_cnv <- with(chek2_rna_cnv, chek2_rna_cnv[!(start==""),])
chek2_rna_cnv <- with(chek2_rna_cnv, chek2_rna_cnv[!(end==""),])
chek2_rna_cnv <- with(chek2_rna_cnv, chek2_rna_cnv[!(Ensembl.gene.ID==""),])
chek2_rna_cnv <- with(chek2_rna_cnv, chek2_rna_cnv[!(hugo==""),])
chek2_rna_cnv$cancer.gene.type <- as.character(chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type[chek2_rna_cnv$cancer.gene.type==""] <- "NA"
chek2_rna_cnv$cancer.gene.type <- as.factor(chek2_rna_cnv$cancer.gene.type)
gene_types <- levels(chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv <- chek2_rna_cnv %>%
group_by(hugo) %>%
slice(1L)
chek2_rna_cnv <- ungroup(chek2_rna_cnv)
dim(chek2_rna_cnv)
## [1] 19182 27
Note 1. You might have noticed that I manipulated the cancer.gene.type variable to change it to a character in order to replace the cancer.gene.type empty cells by “NA”, then converted back to a factor, which partially answers exercise (1A) (see below)
Note 2. I had trouble with further manipulations on the data.frame because its class was grouped (“grouped_df” “tbl_df” “tbl” “data.frame”) so I tried to use the ungroup() function for some code chunks as shown here.
This will become handy for tables and plots, you’ll see:
abbreviations_gene_types <- factor(c("unknown", "ONC", "ONC.pTS", "ONC.TS", "pONC", "pONC.pTS", "pONC.TS", "pTS", "TS"))
abbreviations_gene_types <- as.character(abbreviations_gene_types)
abbr_table <- data.frame(Abbreviation = abbreviations_gene_types, cancer.gene.type = gene_types)
abbr_table %>% kable(format = "markdown", align="c")
| Abbreviation | cancer.gene.type |
|---|---|
| unknown | NA |
| ONC | oncogene |
| ONC.pTS | oncogene; putative tumour suppressor |
| ONC.TS | oncogene; tumour suppressor |
| pONC | putative oncogene |
| pONC.pTS | putative oncogene; putative tumour suppressor |
| pONC.TS | putative oncogene; tumour suppressor |
| pTS | putative tumour suppressor |
| TS | tumour suppressor |
chek2_rna_cnv$cancer.gene.type <- as.character(chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub("NA", "unknown", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub("oncogene", "ONC", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub("tumour suppressor", "TS", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub("putative", "p", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- as.vector(chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub("; ", ".", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- gsub(" ", "", chek2_rna_cnv$cancer.gene.type)
chek2_rna_cnv$cancer.gene.type <- as.factor(chek2_rna_cnv$cancer.gene.type)
OBJECTIVES:
Note 1. I searched several online dictionaries, and I still do not understand the meaning of “principled”: the online dictionaries are giving me answers like “based on moral rules” and “(of a system or method) based on a given set of rules”, so I will conclude that if my approach is coherent and follow a logical path, it is principled.
Note 2. I have temporarily “broken” (messed up) my original data frame (chek2_rna_cnv) several times in the course of this homework, so I will store it in a variable d0 to use in data manipulation just in case I make mistakes.
d0 <- chek2_rna_cnv
Transform some of the variable of chek2_rna_cnv into factors. In cleaning up the dataset, we converted a factors column into characters then back to factors.
Let’s see what classes of data we have in our data frame:
sapply(d0, class) %>% kable(format = "markdown", align="c")
## Warning in kable_markdown(x = structure(c("chr", "start", "end",
## "strand", : The table should have a header (column names)
| chr | factor |
| start | integer |
| end | integer |
| strand | integer |
| cytoband | factor |
| Ensembl.gene.ID | factor |
| hugo | factor |
| copy.category | factor |
| copy.change | integer |
| avg.cna | numeric |
| avg.cna.by.gene | numeric |
| breakpoint | integer |
| manually.curated.homd | integer |
| loh | factor |
| loh_ratio | numeric |
| intepreted.expression.status | factor |
| RPKM | numeric |
| SARC.percentile | integer |
| SARC.kIQR | numeric |
| FC.mean.Bodymap | numeric |
| avg.TCGA.percentile | integer |
| avg.TCGA.kIQR | numeric |
| avg.TCGA.norm.percentile | integer |
| avg.TCGA.norm.kIQR | numeric |
| UCEC.norm.percentile | integer |
| UCEC.norm.kIQR | numeric |
| cancer.gene.type | factor |
Change the avg.TCGA.percentile (class integer) to factors with base R:
d0$avg.TCGA.percentile <- as.factor(d0$avg.TCGA.percentile)
d0 %>%
select(hugo,avg.TCGA.percentile) %>%
head(10) %>% kable(format = "markdown", align="c")
| hugo | avg.TCGA.percentile |
|---|---|
| A1BG | 85 |
| A1CF | 32 |
| A2M | 90 |
| A2ML1 | 70 |
| A4GALT | 62 |
| A4GNT | 43 |
| AAAS | 11 |
| AACS | 3 |
| AADAC | 75 |
| AADACL2 | 77 |
nlevels(d0$avg.TCGA.percentile)
## [1] 101
Change the avg.TCGA.percentile (class integer) to factors with forcats (let’s reset the classes first by “reinitializing” dataframe d0).
d0 <- chek2_rna_cnv
d0 <- with(d0, d0[!is.na(avg.TCGA.percentile),])
d0$avg.TCGA.percentile <- as.character(d0$avg.TCGA.percentile)
d0$avg.TCGA.percentile <- forcats::as_factor(d0$avg.TCGA.percentile)
d0 %>%
select(hugo,avg.TCGA.percentile) %>%
head(10) %>% kable(format = "markdown", align="c")
| hugo | avg.TCGA.percentile |
|---|---|
| A1BG | 85 |
| A1CF | 32 |
| A2M | 90 |
| A2ML1 | 70 |
| A4GALT | 62 |
| A4GNT | 43 |
| AAAS | 11 |
| AACS | 3 |
| AADAC | 75 |
| AADACL2 | 77 |
nlevels(d0$avg.TCGA.percentile)
## [1] 101
Note 1. The forcats package can only “transform” characters into factors, while base R can transform many classes to factors (characters, numeric and integer), that’s why I prefer base R for this specific initial task.
Note 2. Forcats does not like the fact that some values are not available (NA) in the avg.TCGA.percentile. Base R did not have problems with this, but in order to illustrate this function with forcats, I had to remove the rows for which avg.TCGA.percentile was not available).
In the end, I obtain the same result with base R and forcats with the avg.TCGA.percentile as factors, although I know that base R has a tendency to try re-ordering the data, which is not alway what we want.
Filter chek2_rna_cnv to remove all the avg.TCGA.percentile equal to 0 and drop unused levels.
So we are starting with 101 levels (0 to 100, inclusively)
d1 <- d0 %>%
filter(avg.TCGA.percentile != 0) %>%
arrange(avg.TCGA.percentile) %>%
select(hugo, avg.TCGA.percentile)
nlevels(d0$avg.TCGA.percentile)
## [1] 101
d_gdata <- gdata::drop.levels(d1)
nlevels(d_gdata$avg.TCGA.percentile)
## [1] 100
# length(levels(d_gdata$avg.TCGA.percentile)) # alternative method to count levels
Note 1. gdata is very slow, I would not recommend it !
Now let’s see how we can do this with base R.
nlevels(d0$avg.TCGA.percentile)
## [1] 101
d1_baseR <- d1 %>% droplevels()
nlevels(d1_baseR$avg.TCGA.percentile)
## [1] 100
And now let’s do the same with forcats.
nlevels(d0$avg.TCGA.percentile)
## [1] 101
d1$avg.TCGA.percentile %>%
fct_drop() %>%
nlevels()
## [1] 100
Note 1. In the example above, you can see that the number of levels is initially 101 and after removing the avg.TCGA.percentile equal to 0, the number of levels is 100, so I successfully dropped the unused level “0”.
Note 2. To do this, I used a function of the gdata package, as exemplified here
Use the forcats package to change the order of the factor levels, based on a principled summary of one of the quantitative variables. Consider experimenting with a summary statistic beyond the most basic choice of the median.
I will plot the RPKM value according to each hugo gene.
d0 <- chek2_rna_cnv
d0 %>%
arrange(RPKM) %>%
ggplot(aes(x=hugo, y=RPKM)) +
labs(x="hugo genes") +
theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=18),
axis.text.x = element_blank(),
axis.ticks.x=element_blank()) +
ggtitle("RPKM (gene expression) for each hugo gene") +
geom_point(aes(colour=cancer.gene.type), size = 2, alpha=0.9, shape=21)
Note 1. This is not very pretty, I would like to re-order the hugo gene according to their increasing value of RPKM.
Note 2. Using the arrange() function did not resolve this problem because the factors “hugo” are not re-arranged according to RPKM. We need additional functions to do so (see below for examples).
I will arrange the order of the factor hugo according to the ascending value of RPKM, as previously exemplified in my homework 3.
d0$hugo <- factor(d0$hugo, levels = d0$hugo[order(d0$RPKM)])
d0 %>%
arrange(RPKM) %>%
ggplot(aes(x=hugo, y=RPKM)) +
labs(x="hugo genes") +
theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=18),
axis.text.x = element_blank(),
axis.ticks.x=element_blank()) +
ggtitle("Ascending RPKM (gene expression) for each hugo gene") +
geom_point(aes(colour=cancer.gene.type), size = 2, alpha=0.9, shape=21)
I will start by changing the lcass of avg.TCGA.percentile back to a numeric so that I can do a mean function on it. Then, I will show the difference between ordered and un-ordered factors cancer.gene.type according to the mean.
d0$avg.TCGA.percentile <- as.numeric(d0$avg.TCGA.percentile)
mean_per_type <- d0 %>%
filter(avg.TCGA.percentile != 0) %>%
arrange(avg.TCGA.percentile) %>%
group_by(cancer.gene.type) %>%
dplyr::summarize(mean.percentile.by.type = mean(avg.TCGA.percentile))
mean_per_type %>% kable(format = "markdown", align="c")
| cancer.gene.type | mean.percentile.by.type |
|---|---|
| ONC | 49.04714 |
| ONC.pTS | 41.17778 |
| ONC.TS | 40.66667 |
| pONC | 44.84821 |
| pONC.pTS | 46.14286 |
| pONC.TS | 21.33333 |
| pTS | 50.62422 |
| TS | 42.50781 |
| unknown | 47.96121 |
# cancer.gene.type factors in alphabetical order
unordered_plot <- mean_per_type %>%
ggplot(aes(x=cancer.gene.type, y = mean.percentile.by.type, colour=cancer.gene.type)) +
ggtitle("Mean percentile for each cancer.gene.type factor") +
geom_point(size=6) + theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=16), axis.text.x = element_text(angle=45, hjust=1))
unordered_plot
# cancer.gene.type factors ordered according to the mean.percentile.by.type
ordered_plot <- mean_per_type %>%
ggplot(aes(x=fct_reorder(cancer.gene.type, mean.percentile.by.type), y = mean.percentile.by.type, colour = fct_reorder(cancer.gene.type, mean.percentile.by.type))) +
ggtitle("Ordered - Mean percentile for each cancer.gene.type factor") +
labs(x="cancer.gene.type ordered\nby mean.percentile.by.type") +
geom_point( size=6) + theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=16), axis.text.x = element_text(angle=45, hjust=1)) +
guides(colour=guide_legend(title="Ordered cancer.gene.type",
title.theme= element_text(angle=0, size=17, face="bold")))
ordered_plot
Note 1. I had initially printed the full length labels (e.g. “putative tumour suppressor” instead of pTS) and the labels were always outside the figure, not very practival !
We can also order the data.frame factors according to their frequency.
levels(d0$cancer.gene.type) %>% kable(format = "markdown", align="c")
## Warning in kable_markdown(x = structure(c("ONC", "ONC.pTS", "ONC.TS",
## "pONC", : The table should have a header (column names)
| ONC |
| ONC.pTS |
| ONC.TS |
| pONC |
| pONC.pTS |
| pONC.TS |
| pTS |
| TS |
| unknown |
d0$cancer.gene.type %>% forcats::fct_infreq() %>% levels() %>% kable(format = "markdown", align="c")
## Warning in kable_markdown(x = structure(c("unknown", "pTS", "ONC", "TS", :
## The table should have a header (column names)
| unknown |
| pTS |
| ONC |
| TS |
| pONC |
| ONC.pTS |
| ONC.TS |
| pONC.pTS |
| pONC.TS |
Note 1. So the most frequent factor is unknown, then pTS (putative tumour suppressor), then ONC (oncogene), etc.
We can also re-order the factors avg.TCGA.percentile according to their frequency:
d0$avg.TCGA.percentile <- factor(d0$avg.TCGA.percentile)
levels(d0$avg.TCGA.percentile) %>% head(5) %>% kable(format = "markdown", align="c")
## Warning in kable_markdown(x = structure(c("0", "1", "2", "3", "4"), .Dim =
## c(5L, : The table should have a header (column names)
| 0 |
| 1 |
| 2 |
| 3 |
| 4 |
d0$avg.TCGA.percentile %>% forcats::fct_infreq() %>% levels() %>% head(5) %>% kable(format = "markdown", align="c")
## Warning in kable_markdown(x = structure(c("49", "2", "1", "3", "48"), .Dim
## = c(5L, : The table should have a header (column names)
| 49 |
| 2 |
| 1 |
| 3 |
| 48 |
Note 1. The most frequent avg.TCGA.percentile factor is 49, therefore, we know that there is a high number of hugo genes with a gene expression level at the 49th percentile. After that, the second most frequent percentile value is 2, then 1, etc.
We will plot the avg.TCGA.percentile factors in un-ordered fashion.
d0 %>%
group_by(cancer.gene.type) %>%
ggplot(aes(x=avg.TCGA.percentile)) +
ggtitle("Count of hugo genes for each\navg.TCGA.percentile factor") +
labs(x="avg.TCGA.percentile") +
geom_bar(aes(fill=cancer.gene.type)) +
theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=12),
axis.text.x = element_text(angle=45, hjust=1))
Then we will plot the avg.TCGA.percentile in ordered fashion: from the highest count to lowest (and NA will be displayed last by default):
d0 %>%
group_by(cancer.gene.type) %>%
ggplot(aes(x=fct_infreq(avg.TCGA.percentile))) +
ggtitle("Ordered - Count of hugo genes for each\navg.TCGA.percentile factor") +
labs(x="Ordered factor - avg.TCGA.percentile") +
geom_bar(aes(fill=cancer.gene.type)) +
theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size=12),
axis.text.x = element_text(angle=45, hjust=1))
Note 1. I know it’s a bit difficult to read all the factors, but I wanted to give an overview of individual percentile value as a factor, and in the next homework, I am hoping to learn how to group the values according to a range into bins to make more informative plots.
Resources:
Answer: As exemplified above, arrange() is not sufficient to re-order data according to a factor. For example, the hugo genes were not re-ordered according to the RPKM even after using arrange(RPKM), and the visual output is not easy to read/interprete when you don’t rearrange the data according to factors. We need to use functions of base R or forcats to do this (see examples above).
Answer: As mentioned previously, factors have to be treated a different way. When we are ploting numerical values according to numerical values, we can use arrange to re-order the data, but when we use factors, it requires extra-steps in order to go around “R who is trying to re-order your factors data for you (without your permission)”.
Answer: Please refer above for several tables, ways of counting and dropping levels and plots.
Resources: tidyverse and readr
In all honesty, I spent quite a bit of time on visual aspects of ggplot graph in in my homework 3 since I needed it for research-related problems at that time as well.
Therefore, I will try something different here: making a color scheme for the hugo genes based on the cancer.gene.type category they belong to. The entire code below is based on the work of Jenny Bryan, specifically tutorial on ggplot2 here (create color scheme for data) and here (plot the data).
In order to decide of the colour scheme needed, I need to know how many hugo genes there is in each cancer.gene.type category:
d2 <- d0 %>%
filter(cancer.gene.type != "unknown")
dim(d2)
## [1] 1281 27
d2.cancer.gene.type <- aggregate(hugo~cancer.gene.type, d2, function(x) length(unique(x)))
n.cancer.gene.type <- nrow(d2.cancer.gene.type)
n.cancer.gene.type
## [1] 8
d2.cancer.gene.type %>% kable(format = "markdown", align="c")
| cancer.gene.type | hugo |
|---|---|
| ONC | 303 |
| ONC.pTS | 46 |
| ONC.TS | 12 |
| pONC | 114 |
| pONC.pTS | 8 |
| pONC.TS | 3 |
| pTS | 667 |
| TS | 128 |
Note 1. I realized later on that it was not feasible to keep the unknown values, there were too many, so I had to filter them out.
Map cancer.gene.type into colours using the RColorBrewer package
display.brewer.all(type = "div")
color_anchors <-
list(
# unknown = brewer.pal(n=3, 'RdBu')[7:9], # Removed, causing issues
ONC = brewer.pal(n=10, 'RdYlBu')[1:4], #
ONC.pTS = brewer.pal(n=10, 'PuOr')[2:5], #
ONC.TS = brewer.pal(n=10, 'RdYlBu')[5:8], #
pONC = brewer.pal(n=10, 'PiYG')[1:4], #
pONC.pTS=brewer.pal(n=3,'BrBG')[3:4], #
pONC.TS = brewer.pal(n=3, 'PRGn')[3:4], #
pTS = brewer.pal(n=10, 'RdBu')[7:11], #
TS= brewer.pal(n=10, 'PiYG')[7:11])
Select the first or darkest color to represent the whole cancer.gene.type category
d2.cancer.gene.type$color <- lapply(color_anchors, "[",1 )
Expand into palettes big enough to cover each gene in a continent
library(plyr)
d2 <- data.frame(d2)
hugo_colors <- plyr::ddply(d2, ~cancer.gene.type, function(x) {
the.cancer.gene.type <- x$cancer.gene.type[1]
x <- droplevels(x)
hugo.lowtohigh <-
with(x, levels(reorder(hugo, RPKM)))
colorFun <- colorRampPalette(color_anchors[[the.cancer.gene.type]])
return(data.frame(hugo = I(hugo.lowtohigh),
color = I(colorFun(length(hugo.lowtohigh)))))
})
## fiddly parameters that control printing of hugo names
charLimit <- 12
xFudge <- 0.05
jCex <- 0.75
Note 1. ddply() is a function from the plyr package, as mentioned here
We can then store the figure making code as a function so can make pdf and png.
make_figure <- function() {
plot(c(0, n.cancer.gene.type), c(0,1), type = "n",
xlab = "", ylab="", xaxt = "n", yaxt = "n", bty = "n",
main="CHEK2 data - Cancer Gene Type - Color Scheme")
for(i in seq_len(n.cancer.gene.type)) {
this.cancer.gene.type <- d2.cancer.gene.type$cancer.gene.type[i]
nCols <- with(d2.cancer.gene.type, hugo[cancer.gene.type==this.cancer.gene.type])
yFudge = 0.1/nCols
foo <- seq(from = 0, to =1, length.out = nCols+1)
rect(xleft = i-1,
ybottom = foo[-(nCols + 1)],
xright = i,
ytop = foo[-1],
col=with(hugo_colors, color[cancer.gene.type==this.cancer.gene.type]))
text(x = i - 1 + xFudge,
y = foo[-(nCols + 1)] + yFudge,
labels = with(hugo_colors,
substr(hugo[cancer.gene.type==this.cancer.gene.type], 1, charLimit)),
adj = c(0, 0), cex = jCex)
}
mtext(d2.cancer.gene.type$cancer.gene.type, side = 3, at = seq_len(n.cancer.gene.type) - 0.5)
mtext(c("highest.RPKM", "smallest.RPKM"),
side = 2, at = c(0.9, 0.1), las = 1)
}
op <- par(mar = c(1, 4, 6, 1) + 0.1)
make_figure()
par(op)
png("scratch-space/chek2-cancer.gene.type-colors.png",
width = 30, height = 70, units = "in", res = 200)
op <- par(mar = c(1, 4, 6, 1) + 0.1)
make_figure()
dev.off()
## quartz_off_screen
## 2
par(op)
pdf("scratch-space/chek2-cancer.gene.type-colors.pdf",
width = 30, height = 70)
op <- par(mar = c(1, 4, 6, 1) + 0.1)
make_figure()
dev.off()
## quartz_off_screen
## 2
par(op)
write.table(hugo_colors, "scratch-space/chek2-hugo-colors.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
d2.cancer.gene.type$cancer.gene.type <- as.character(d2.cancer.gene.type$cancer.gene.type)
d2.cancer.gene.type$hugo <- as.character(d2.cancer.gene.type$hugo)
d2.cancer.gene.type$color <- as.character(d2.cancer.gene.type$color)
write.table(d2.cancer.gene.type, "scratch-space/chek2-cancer.gene.type-colors.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
Note 2. I had to convert the d2.cancer.gene.type table to characters before being able to write to a tsv file, as mentioned in this stack overflow discussion here.
Let’s write a figure to a file, then load it back to embed it in this report.
Merge color into data, as shown here
suppressWarnings(suppressMessages(library(plyr)))
hugo_colors <- read.delim("scratch-space/chek2-hugo-colors.tsv", colClasses = list(color = "character"), col.names = c("cancer.gene.type", "hugo", "hugo_color"))
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist
#str(hugo_colors)
#insert color as a variable into d2
d2 <- merge(d2, hugo_colors)
So now we have added a column specifying a color for each gene:
d2 %>%
select(hugo, hugo_color) %>%
head(5) %>%
kable(format = "markdown", align="c")
| hugo | hugo_color |
|---|---|
| ABI1 | #F67B49 |
| ABI2 | #0E4179 |
| ABL1 | #F78950 |
| ABL2 | #D93529 |
| ACACA | #C51B7D |
Then we can use this to make plots, as Jenny Bryan showed here
d2 %>%
filter(cancer.gene.type != "unknown") %>%
group_by(cancer.gene.type) %>%
ggplot(aes(x=log2(RPKM), y=FC.mean.Bodymap, shape=copy.category), na.rm=TRUE) +
ggtitle("FC.mean.Bodymap as a function of log2(RPKM)") +
scale_size_continuous(range=c(1,40)) +
facet_wrap(~cancer.gene.type) +
aes(colour= hugo_color) + scale_colour_identity() +
theme_bw() + theme(
plot.title= element_text(color = "grey44", size=24, face="bold"),
text = element_text(size =18),
strip.text = element_text(size = rel(1.1))) +
geom_point()
ggsave("scratch-space/chek2_cancer_gene_type_color_plot.png", width = 30, height = 20, units = "cm", dpi = 300)
Note 1. In the plot above, one might want to focus on the genes that have low expression (low RPKM) and copy loss, or high expression and copy gain, as these paired-values are more likely to be biologically relevant.
a_plot
If you are generating two plots in the same code chunk, you need to specify which plot to save with ggsave, otherwise, only the last plot will be saved. For example, let’s take the same plots previously shown.
unordered_plot
ordered_plot
ggsave("scratch-space/some_plot.png", width = 30, height = 20, units = "cm", dpi = 300)
As you can see, the plot saved here is the ordered_plot because that was the last one in the code chunk, while unordered_plot has not been saved.
If you want to save all the plots in a code chunk, you have to add the ggsave() function after each plot code block, or you can put one ggsave() function for each plot (and specify the plot) at the end of the code chunk.
Put the ggsave function after each plot:
unordered_plot
ggsave("scratch-space/unordered.png", width = 30, height = 20, units = "cm", dpi = 300)
ordered_plot
ggsave("scratch-space/ordered.png", width = 30, height = 20, units = "cm", dpi = 300)
Or you can put the ggsave functions at the end:
unordered_plot
ordered_plot
ggsave("scratch-space/unordered.png", plot =unordered_plot, width = 30, height = 20, units = "cm", dpi = 300)
ggsave("scratch-space/ordered.png", plot =ordered_plot, width = 30, height = 20, units = "cm", dpi = 300)
And you can see that both plots were successfully saved: unordered_plot here and ordered_plot here !
Resources:
I have used some strategies to keep my repository clean and organized: